%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% This file is part of the book
%%
%% Probability and Its Applications
%% http://code.google.com/p/probability-book/
%%
%% Copyright (C) 2010 Minh Van Nguyen <nguyenminh2@gmail.com>
%%
%% See the file COPYING for copying conditions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{The Probabilistic Method}

The probabilistic method is a technique for proving the existence of
an object with a desired property. We exhibit a sample space of
objects and show that the probability $p$ that a randomly selected
object has the desired property satisfies $p > 0$. Since this
probability is positive, the sample space must contain at least one
object with the required property, and the existence of such an
object follows.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Basic counting argument}

Let $K_n$ be the complete graph on $n$ vertices. If $C$ is a clique of
$k$ vertices in $K_n$, then $C$ is a copy of the complete graph
$K_k$. Notice that $K_n$ has $\binom{n}{2}$ edges.

\begin{theorem}
If
\[
\binom{n}{k} \cdot 2^{-\binom{k}{2} + 1}
<
1
\]
then it is possible to color the edges of $K_n$ with two colors such
that $K_n$ has no monochromatic $K_k$ subgraphs.
\end{theorem}

\begin{proof}
Let $S$ be a sample space consisting of all the possible colorings of
the edges of $K_n$ using two colors. There are $2^{\binom{n}{2}}$
possible colorings of $K_n$ using two colors, so $S$ has cardinality
$2^{\binom{n}{2}}$. The probability of choosing a coloring from $S$
uniformly at random is given by
\[
\frac{1}{2^{\binom{n}{2}}}.
\]
We have $\binom{n}{k}$ different $k$-vertex cliques in $K_n$. Fix an
arbitrary ordering of these $\binom{n}{k}$ different $k$-vertex
cliques in $K_n$. Let $A_i$ be the event that clique $i$ is
monochromatic, for $i = 1, \dots, \binom{n}{k}$. For a given clique
$i$, once the first edge is colored, the remaining
$\binom{k}{2} - 1$ edges must receive the same color. Thus
\[
\Pr(A_i)
=
2^{-\binom{k}{2} + 1}.
\]
Use the assumptions of the theorem and a union bound to get
%%
\begin{align*}
\Pr\left(\bigcup_{i=1}^{\binom{n}{k}} A_i\right)
&\leq
\sum_{i=1}^{\binom{n}{k}} \Pr(A_i) \\[4pt]
&=
\binom{n}{k} \cdot 2^{-\binom{k}{2} + 1} \\[4pt]
&<
1
\end{align*}
%%
and hence
%%
\begin{align*}
\Pr\left(\bigcap_{i=1}^{\binom{n}{k}} \overline{A_i}\right)
&=
1 - \Pr\left(\bigcup_{i=1}^{\binom{n}{k}} A_i\right) \\[4pt]
&>
0.
\end{align*}
%%
This means that we have a positive probability of choosing from our
sample space a coloring with no monochromatic $k$-vertex
cliques. Conclude that there exists a coloring with no monochromatic
$k$-vertex cliques.
\end{proof}
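
The proof is nonconstructive, but it suggests a simple randomized
procedure: repeatedly color the edges of $K_n$ at random and test for
monochromatic $k$-cliques. Below is a minimal Python sketch of this
procedure, an illustration only; the exhaustive clique test makes it
practical just for small $n$ and $k$.

\begin{verbatim}
import itertools
import random

def random_coloring(n):
    """Color each edge of K_n independently with color 0 or 1."""
    return {e: random.randint(0, 1)
            for e in itertools.combinations(range(n), 2)}

def has_monochromatic_clique(coloring, n, k):
    """Test whether some k-clique has all its edges in one color."""
    for clique in itertools.combinations(range(n), k):
        colors = {coloring[e] for e in itertools.combinations(clique, 2)}
        if len(colors) == 1:
            return True
    return False

# binom(10,5) * 2^(1 - binom(5,2)) = 252/512 < 1, so a good coloring exists.
n, k = 10, 5
coloring = random_coloring(n)
while has_monochromatic_clique(coloring, n, k):
    coloring = random_coloring(n)
print("coloring of K_%d with no monochromatic K_%d found" % (n, k))
\end{verbatim}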


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Expectation argument}

Sometimes it is easier to use an averaging argument to prove the
existence of an object having some required properties. The intuition
behind this approach is that in a discrete probability space, a
random variable assumes, with positive probability, a value at most
its expectation, and likewise a value at least its expectation.

\begin{lemma}
\label{lem:probabilistic_method:at_least_one_value_above_below_mean}
Let $S$ be a sample space and let $X$ be a random variable defined on
$S$ such that $\E[X] = \mu$. Then $\Pr(X \geq \mu) > 0$ and
$\Pr(X \leq \mu) > 0$.
\end{lemma}

\begin{proof}
By definition we have
\[
\mu
=
\E[X]
=
\sum_x x \cdot \Pr(X = x)
\]
summing over all values in the range of $X$. If $\Pr(X \geq \mu) = 0$,
then we have the contradiction
%%
\begin{align*}
\mu
&=
\sum_x x \cdot \Pr(X = x) \\[4pt]
&=
\sum_{x < \mu} x \cdot \Pr(X = x) \\[4pt]
&<
\sum_{x < \mu} \mu \cdot \Pr(X = x) \\[4pt]
&=
\mu.
\end{align*}
%%
Similarly, if $\Pr(X \leq \mu) = 0$, then we have another
contradiction:
%%
\begin{align*}
\mu
&=
\sum_x x \cdot \Pr(X = x) \\[4pt]
&=
\sum_{x > \mu} x \cdot \Pr(X = x) \\[4pt]
&>
\sum_{x > \mu} \mu \cdot \Pr(X = x) \\[4pt]
&=
\mu.
\end{align*}
%%
Thus the lemma holds.
\end{proof}
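
As a sanity check of the lemma, consider the roll of a fair
six-sided die, whose expectation is $\mu = 3.5$. The following Python
snippet computes $\Pr(X \geq \mu)$ and $\Pr(X \leq \mu)$ exactly and
confirms that both are positive.

\begin{verbatim}
# Exact check of the lemma for a fair six-sided die.
values = [1, 2, 3, 4, 5, 6]
probs = [1.0 / 6] * 6

mu = sum(x * p for x, p in zip(values, probs))           # E[X] = 3.5
p_ge = sum(p for x, p in zip(values, probs) if x >= mu)  # Pr(X >= mu) = 1/2
p_le = sum(p for x, p in zip(values, probs) if x <= mu)  # Pr(X <= mu) = 1/2
assert p_ge > 0 and p_le > 0
print(mu, p_ge, p_le)
\end{verbatim}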

As another application of the probabilistic method, we want to find a
large cut in an undirected graph. A cut of a graph is a partition of
its vertices into two disjoint sets. The value of a cut is the total
weight of the edges crossing from one part to the other. For
simplicity, we consider the case where each edge has weight $1$.

\begin{theorem}
\label{thm:probabilistic_method:graph_cut_of_value_at_least_m_half}
Let $G = (V, E)$ be an undirected graph of order $|V| = n$ and size
$|E| = m$. We can partition the vertex set $V$ into two disjoint sets
$A$ and $B$ such that at least $m/2$ edges connect a vertex in $A$ to
a vertex in $B$. In other words, there is a cut with value
$\geq m/2$.
\end{theorem}

\begin{proof}
Construct the two sets $A$ and $B$ by assigning each vertex
independently and uniformly at random to one of the two
sets. Consider an arbitrary enumeration
$e_1, \dots, e_m$ of the edges of $G$ and for $i = 1, \dots, m$ define
$X_i$ to be
\[
X_i
=
\begin{cases}
1, & \text{if edge $i$ connects $A$ to $B$,} \\
0, & \text{otherwise.}
\end{cases}
\]
There is a probability of $1/2$ that an edge connects a vertex in $A$
to a vertex in $B$ and therefore $\E[X_i] = 1/2$. Let $C(A,B)$ be a
random variable denoting the value of the cut. As each edge has unit
weight, use the definition of $X_i$ and the linearity of expectations
to get
%%
\begin{align*}
\E[C(A,B)]
&=
\E\left[\sum_{i=1}^m X_i\right] \\[4pt]
&=
\sum_{i=1}^m \E[X_i] \\[4pt]
&=
m \cdot \frac{1}{2} \\[4pt]
&=
\frac{m}{2}.
\end{align*}
%%
Conclude by
Lemma~\ref{lem:probabilistic_method:at_least_one_value_above_below_mean}
that there exists a partition $A, B$ with at least $m/2$ edges
connecting vertices in $A$ to vertices in $B$.
\end{proof}
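
The random partition in the proof is easy to simulate. The Python
sketch below repeatedly assigns the vertices of a graph to $A$ or $B$
by fair coin flips and averages the resulting cut values; for $K_5$,
which has $m = 10$ edges, the average is close to $m/2 = 5$.

\begin{verbatim}
import itertools
import random

def random_cut(n, edges):
    """Assign each vertex to side 0 or 1 by a fair coin; return the
    number of edges whose endpoints land on different sides."""
    side = {v: random.randint(0, 1) for v in range(n)}
    return sum(1 for u, v in edges if side[u] != side[v])

edges = list(itertools.combinations(range(5), 2))   # K_5, m = 10
trials = 10000
average = sum(random_cut(5, edges) for _ in range(trials)) / float(trials)
print(average)   # close to m/2 = 5
\end{verbatim}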

We now consider the maximum satisfiability~(MAXSAT) problem. A literal
in a logical formula is a boolean variable or its negation. If $x$ is
a boolean variable, its negation is denoted $\overline{x}$. A
satisfiability~(SAT) formula is a logical expression consisting of a
conjunction of clauses, each clause being a disjunction of
literals. The SAT problem asks for an assignment of boolean values to
the variables such that every clause, and therefore the whole
formula, evaluates to true. The related MAXSAT problem asks for an
assignment that satisfies as many of the clauses as possible.

\begin{theorem}
Consider a collection of $m$ clauses none of which contains both a
variable and its complement. For $i = 1, \dots, m$ let $k_i$ be the
number of literals in the $i$-th clause. Then there is a truth
assignment satisfying at least
\[
\sum_{i=1}^m \big(1 - 2^{-k_i}\big)
\geq
m\big(1 - 2^{-k}\big)
\]
clauses, where $k = \min_{1 \leq i \leq m} k_i$.
\end{theorem}

\begin{proof}
Let boolean values be assigned independently and uniformly at random
to the variables. Clause $i$ is false if and only if each of its $k_i$
literals is false, and is true if at least one of the $k_i$ literals
is true. Of all the possible $2^{k_i}$ truth assignments for clause
$i$, only one truth assignment results in a false clause and each of
the remaining $2^{k_i} - 1$ truth assignments results in a true
clause. The probability that the $i$-th clause is satisfied is thus
exactly $1 - 2^{-k_i}$. Let $X_i$ be a random variable defined by
\[
X_i
=
\begin{cases}
1, & \text{if clause $i$ is satisfied,} \\
0, & \text{otherwise.}
\end{cases}
\]
Then $\E[X_i] = 1 - 2^{-k_i}$ and the expected number of satisfied
clauses is
%%
\begin{align*}
\E\left[\sum_i X_i\right]
&=
\sum_i \E[X_i] \\[4pt]
&=
\sum_i \big(1 - 2^{-k_i}\big) \\[4pt]
&\geq
m\big(1 - 2^{-k}\big).
\end{align*}
%%
Conclude by
Lemma~\ref{lem:probabilistic_method:at_least_one_value_above_below_mean}
that there is a truth assignment that satisfies at least
\[
\sum_i \big(1 - 2^{-k_i}\big)
\geq
m\big(1 - 2^{-k}\big)
\]
clauses.
\end{proof}
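
The following Python sketch simulates the random assignment in the
proof. A literal is represented as a pair (variable index, sign),
with sign False denoting a negated variable; these conventions are
ours, chosen for the illustration.

\begin{verbatim}
import random

def satisfied_count(clauses, num_vars):
    """Assign each variable True/False by a fair coin and return
    the number of satisfied clauses."""
    assignment = [random.choice([True, False]) for _ in range(num_vars)]
    return sum(1 for clause in clauses
               if any(assignment[v] == sign for v, sign in clause))

# (x0 or x1) and (not x0 or x2) and (not x1 or not x2): each k_i = 2.
clauses = [[(0, True), (1, True)],
           [(0, False), (2, True)],
           [(1, False), (2, False)]]
trials = 10000
average = sum(satisfied_count(clauses, 3)
              for _ in range(trials)) / float(trials)
print(average)   # close to 3 * (1 - 2^(-2)) = 2.25
\end{verbatim}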


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Derandomization using conditional expectations}

Theorem~\ref{thm:probabilistic_method:graph_cut_of_value_at_least_m_half}
and its proof describe a probabilistic argument for obtaining a cut of
a graph whose expected value is at least $m/2$, where $m$ is the
graph's size. Instead of allocating vertices independently and
uniformly at random to either of the two sets $A$ and $B$, now
consider a deterministic allocation. The vertices of an undirected
graph of order $n$ are deterministically placed, one at a time, in an
arbitrary order $v_1, \dots, v_n$. Denote by $x_i$ the set in which
$v_i$ is placed, so either $x_i = A$ or $x_i = B$. Suppose the first
$k$ vertices $v_1, \dots, v_k$ have been placed in positions
$x_1, \dots, x_k$. We want to know the expected value of the cut if
each remaining vertex is placed uniformly and independently into
either of the two sets. Write
\[
\E[C(A,B) \mid x_1, \dots, x_k]
\]
as the conditional expectation of the value of the cut given that the
first $k$ vertices have been placed in positions $x_1, \dots, x_k$,
respectively. How can we place the next remaining vertex $v_{k+1}$ so
as to increase~(or at least not to decrease) the expected value of the
cut? In other words, we want a placement of $v_{k+1}$ such that
\[
\E[C(A,B) \mid x_1, \dots, x_k]
\leq
\E[C(A,B) \mid x_1, \dots, x_k, x_{k+1}].
\]

We can argue by induction on the positions of the vertices. The base
case
\[
\E[C(A,B) \mid x_1]
=
\E[C(A,B)]
\]
holds because, by symmetry, it is irrelevant where we place the first
vertex. For
the inductive case, let $v_1, \dots, v_k$ be placed in positions
$x_1, \dots, x_k$. We now place $v_{k+1}$ uniformly at random into
either $A$ or $B$. Denote by $Y_{k+1}$ the random variable
representing the set where $v_{k+1}$ is placed. Then we have
%%
\begin{align*}
& \E[C(A,B) \mid x_1, \dots, x_k] \\
&=
\frac{1}{2} \cdot \E[C(A,B) \mid x_1, \dots, x_k, Y_{k+1} = A]
+ \frac{1}{2} \cdot \E[C(A,B) \mid x_1, \dots, x_k, Y_{k+1} = B]
\end{align*}
%%
and hence
%%
\begin{align*}
& \E[C(A,B) \mid x_1, \dots, x_k] \\
&\leq
\max\big\{
\E[C(A,B) \mid x_1, \dots, x_k, Y_{k+1} = A],\;
\E[C(A,B) \mid x_1, \dots, x_k, Y_{k+1} = B]
\big\}.
\end{align*}
%%
That is, by placing $v_{k+1}$ in the set that yields the larger
expected cut value, we satisfy the condition
\[
\E[C(A,B) \mid x_1, \dots, x_k]
\leq
\E[C(A,B) \mid x_1, \dots, x_{k+1}].
\]

From the above derandomization process, we obtain a simple greedy
algorithm as follows. Consider the vertices in some arbitrary
order. Place the first vertex in either $A$ or $B$; it does not
matter which. Place each successive vertex in the set containing
fewer of its already placed neighbors, thereby maximizing the number
of edges crossing between $A$ and $B$ at each step.
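
A minimal Python sketch of this greedy algorithm follows; ties are
broken arbitrarily. By the argument above, the returned cut has value
at least $m/2$.

\begin{verbatim}
import itertools

def greedy_cut(n, edges):
    """Derandomized max cut: place each vertex in the set containing
    fewer of its already placed neighbors."""
    neighbors = {v: set() for v in range(n)}
    for u, v in edges:
        neighbors[u].add(v)
        neighbors[v].add(u)
    side = {}
    for v in range(n):
        in_a = sum(1 for u in neighbors[v] if side.get(u) == 'A')
        in_b = sum(1 for u in neighbors[v] if side.get(u) == 'B')
        side[v] = 'B' if in_a >= in_b else 'A'
    return side, sum(1 for u, v in edges if side[u] != side[v])

edges = list(itertools.combinations(range(6), 2))   # K_6, m = 15
side, value = greedy_cut(6, edges)
print(value)   # at least 15/2, hence at least 8
\end{verbatim}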


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Sample and modify}

The method of sample and modify works by first generating a random
structure that need not have the required properties. We then modify
that random structure in order to obtain a structure having the
desired properties. One application of the sample-and-modify
technique is in bounding the size of the largest independent set of a
graph. Recall that an independent set of a graph $G = (V,E)$ is a
vertex subset $V' \subseteq V$ such that no two vertices in $V'$ are
joined by an edge.

\begin{theorem}
Consider a graph $G = (V,E)$ with $n$ vertices and $m \geq n/2$
edges. Then $G$ has an independent set $V'$ such that
$|V'| \geq n^2 / 4m$.
\end{theorem}

\begin{proof}
The total degree of $G$ is $2m$ and its average degree is
$d = 2m / n$. Since $m \geq n/2$, we have $d \geq 1$. Consider the
following process:
%%
\begin{enumerate}
\item Each vertex $v \in V$, together with its incident edges, is
  removed independently from $G$ with probability $1 - 1/d$. The
  result of the vertex deletion is a subgraph $G'$.

\item For each edge $e$ remaining in $G'$, remove $e$ together with
  exactly one of its endpoints. This results in an independent set
  because all edges have been removed.
\end{enumerate}
%%
Define a random variable $X_i$ by
\[
X_i
=
\begin{cases}
1, & \text{if $v_i$ survives step~1,} \\
0, & \text{otherwise}
\end{cases}
\]
and let $X = \sum_i X_i$. Each vertex $v_i$ survives with probability
$1/d$, hence $\E[X_i] = 1/d$. The expected number of vertices that
survive step~1 is
\[
\E[X]
=
\E\left[\sum_i X_i\right]
=
\sum_i \E[X_i]
=
n/d.
\]
The original graph $G$ has a total of $m = nd / 2$ edges. An edge
survives step~1 if and only if both of its endpoints survive,
i.e. each edge survives with probability $(1/d)^2$. Define a random
variable $Y_i$ by
\[
Y_i
=
\begin{cases}
1, & \text{if edge $e_i$ survives,} \\
0, & \text{otherwise}
\end{cases}
\]
and write $Y = \sum_i Y_i$. Use an argument similar to how we derive
$\E[X]$ to conclude that the expected number of edges surviving step~1
is
\[
\E[Y]
=
\E\left[\sum_i Y_i\right]
=
\sum_i \E[Y_i]
=
\frac{nd}{2} \cdot \left(\frac{1}{d}\right)^2
=
\frac{n}{2d}.
\]
After the second step, we have removed at most $Y$ more vertices and
produced an independent set with at least $X - Y$ vertices. The
expected number of vertices in the resulting independent set is
\[
\E[X - Y]
=
\frac{n}{d} - \frac{n}{2d}
=
\frac{n}{2d}
=
\frac{n^2}{4m}
\]
and therefore, by
Lemma~\ref{lem:probabilistic_method:at_least_one_value_above_below_mean},
there exists an independent set with at least $n^2 / 4m$ vertices.
\end{proof}
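
The two steps of the proof translate directly into the following
Python sketch. One run returns an independent set whose expected size
is at least $n^2 / 4m$.

\begin{verbatim}
import random

def sample_and_modify(n, edges):
    """Keep each vertex with probability 1/d, then delete one
    endpoint of every surviving edge; assumes m >= n/2."""
    d = 2.0 * len(edges) / n               # average degree, d >= 1
    kept = {v for v in range(n) if random.random() < 1 / d}
    for u, v in edges:                     # step 2: kill surviving edges
        if u in kept and v in kept:
            kept.discard(v)
    return kept                            # an independent set

# A 4-cycle plus a chord: n = 4, m = 5, so the bound is n^2/4m = 0.8.
edges = [(0, 1), (1, 2), (2, 3), (3, 0), (0, 2)]
print(sample_and_modify(4, edges))
\end{verbatim}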

As another application of the sample-and-modify technique, we provide
a bound relating the size and the girth of a graph. Recall that the
girth of a graph is the length of its shortest cycle. Before deriving
the bound, we first discuss random graph models.

Two closely related models of random graphs are $G_{n,p}$ and
$G_{n,N}$. The model $G_{n,p}$ considers the collection of all
undirected graphs on $n$ vertices, each such graph having at most
$\binom{n}{2}$ edges. A graph with exactly $m$ edges is assigned the
probability
\[
p^m (1 - p)^{\binom{n}{2} - m}.
\]
Notice the resemblance to the binomial distribution. To generate a
random graph in $G_{n,p}$, consider each of the $\binom{n}{2}$
possible edges in some order and add it independently to our graph
with probability $p$. The expected number of edges in the resulting
graph is
\[
p \cdot \binom{n}{2}
=
\frac{p \cdot n!} {2! (n - 2)!}
\]
and the expected total degree is
\[
2p \cdot \binom{n}{2}
=
pn(n - 1).
\]
Then the expected degree of each vertex is $p(n - 1)$.

Like the model $G_{n,p}$, in the model $G_{n,N}$ we consider all
undirected graphs on $n$ vertices. However, each such graph in
$G_{n,N}$ has exactly $N$ edges, hence $G_{n,N}$ is a collection of
$\binom{\binom{n}{2}} {N}$ graphs on exactly $N$ edges, each of which
is selected with equal probability. To generate a graph in $G_{n,N}$,
choose $N$ of the $\binom{n}{2}$ possible edges uniformly at random
without replacement.
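
Both models are straightforward to sample, as the following Python
sketch shows.

\begin{verbatim}
import itertools
import random

def gnp(n, p):
    """Sample from G_{n,p}: include each possible edge independently
    with probability p."""
    return [e for e in itertools.combinations(range(n), 2)
            if random.random() < p]

def gnN(n, N):
    """Sample from G_{n,N}: choose N of the binom(n,2) possible edges
    uniformly at random without replacement."""
    return random.sample(list(itertools.combinations(range(n), 2)), N)

print(len(gnp(20, 0.3)))   # on average p * binom(20,2) = 57 edges
print(len(gnN(20, 57)))    # always exactly 57 edges
\end{verbatim}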

\begin{theorem}
Consider an integer $k \geq 3$. For sufficiently large $n$, there is
a graph with $n$ vertices, $\geq \frac{1}{4} n^{1 + 1/k}$ edges, and
girth $\geq k$.
\end{theorem}

\begin{proof}
Sample a random graph $G \in G_{n,p}$ with edge probability
$p = n^{1/k - 1}$. Define a random variable $X_i$ by
\[
X_i
=
\begin{cases}
1, & \text{if edge $e_i$ is chosen,} \\
0, & \text{otherwise}
\end{cases}
\]
and let $X = \sum_i X_i$. Then the expected number of edges in $G$ is
\[
\E[X]
=
\binom{n}{2} p
=
\frac{1}{2} n^{1/k + 1} \left(1 - \frac{1}{n}\right).
\]
Each cycle of length $i$ occurs with probability $p^i$ and there are
\[
\binom{n}{i} \frac{(i - 1)!}{2},
\qquad
3 \leq i \leq k - 1
\]
possible cycles of length $i$. For each cycle $c$ of length at most
$k - 1$, define a random variable $Y_c$ by
\[
Y_c
=
\begin{cases}
1, & \text{if cycle $c$ appears in $G$,} \\
0, & \text{otherwise}
\end{cases}
\]
and let $Y = \sum_c Y_c$. Then the expected number of cycles
of length $\leq k - 1$ is
%%
\begin{align*}
\E[Y]
&=
\sum_{i=3}^{k-1} \binom{n}{i} \frac{(i - 1)!}{2} p^i \\[4pt]
&\leq
\sum_{i=3}^{k-1} n^i p^i \\[4pt]
&=
\sum_{i=3}^{k-1} n^{i/k} \\[4pt]
&<
kn^{(k-1) / k}.
\end{align*}
%%
Modify $G$ by removing an edge from each cycle of length
$\leq k - 1$. The resulting graph has girth $\geq k$. For sufficiently
large $n$, the expected number of edges in the modified graph is
%%
\begin{align*}
\E[X - Y]
&\geq
\frac{1}{2} n^{1/k + 1} \left(1 - \frac{1}{n}\right) - kn^{(k-1) / k} \\[4pt]
&\geq
\frac{1}{4} n^{1/k + 1}
\end{align*}
%%
and the theorem follows by
Lemma~\ref{lem:probabilistic_method:at_least_one_value_above_below_mean}.
\end{proof}
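
The modification step can be carried out algorithmically. The Python
sketch below samples $G \in G_{n,p}$ and repeatedly removes an edge
closing a cycle of length less than $k$, detected by the standard
breadth-first search argument: a non-tree edge $(u,v)$ encountered
during a BFS closes a cycle of length at most
$\mathit{dist}(u) + \mathit{dist}(v) + 1$. This is a sketch of one
possible implementation, not part of the formal argument.

\begin{verbatim}
import itertools
import random
from collections import deque

def short_cycle_edge(adj, k):
    """Return an edge lying on a cycle of length < k, or None."""
    for root in adj:
        dist, parent = {root: 0}, {root: None}
        queue = deque([root])
        while queue:
            u = queue.popleft()
            for v in adj[u]:
                if v not in dist:
                    dist[v], parent[v] = dist[u] + 1, u
                    queue.append(v)
                elif parent[u] != v and dist[u] + dist[v] + 1 < k:
                    return (u, v)
    return None

def high_girth_graph(n, k):
    """Sample G_{n,p} with p = n^(1/k - 1), then delete one edge from
    each cycle shorter than k."""
    p = n ** (1.0 / k - 1)
    edges = {e for e in itertools.combinations(range(n), 2)
             if random.random() < p}
    adj = {v: set() for v in range(n)}
    for u, v in edges:
        adj[u].add(v)
        adj[v].add(u)
    while True:
        e = short_cycle_edge(adj, k)
        if e is None:
            return edges                 # girth is now at least k
        u, v = e
        edges.discard((min(u, v), max(u, v)))
        adj[u].discard(v)
        adj[v].discard(u)

print(len(high_girth_graph(100, 4)))
\end{verbatim}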


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Second moment method}

In addition to the method of sample and modify, the second moment
method is another useful tool in the probabilistic method. The
standard approach when using the second moment method is to apply the
following inequality.

\begin{theorem}
\label{thm:probabilistic_method:upper_bound_nonnegative_integer_rand_var}
Given a random variable $X$ that assumes only nonnegative integer
values, we have
\[
\Pr(X = 0)
\leq
\frac{\Var[X]} {\big(\E[X]\big)^2}.
\]
\end{theorem}

\begin{proof}
Since $X$ takes only nonnegative integer values, the event $X = 0$
implies $|X - \E[X]| \geq \E[X]$. Apply Chebyshev's inequality to
produce
%%
\begin{align*}
\Pr(X = 0)
&\leq
\Pr\big(|X - \E[X]| \geq \E[X]\big) \\[4pt]
&\leq
\frac{\Var[X]} {\big(\E[X]\big)^2}
\end{align*}
%%
as required.
\end{proof}

\begin{lemma}
\label{lem:probabilistic_method:upper_bound_var_with_expectation_covar}
Let $Y_1, \dots, Y_m$ be $0$-$1$ random variables and write
$Y = \sum_i Y_i$. Then
\[
\Var[Y]
\leq
\E[Y] + \sum_{\substack{1 \leq i,j \leq m \\ i \neq j}} \Cov(Y_i,\, Y_j).
\]
\end{lemma}

\begin{proof}
Generalizing
Theorem~\ref{thm:moments:variance_of_sum_of_random_variables} to a
finite collection of random variables, we have
%%
\begin{align*}
\Var\left[\sum_i Y_i\right]
&=
\sum_i \Var[Y_i] + 2 \sum_{i=1}^m \sum_{j>i} \Cov(Y_i,\, Y_j) \\[4pt]
&=
\sum_i \Var[Y_i]
+ \sum_{\substack{1 \leq i,j \leq m \\ i \neq j}} \Cov(Y_i,\, Y_j).
\end{align*}
%%
Since each $Y_i$ is a $0$-$1$ random variable, we have
$\E[Y_i^2] = \E[Y_i]$ and therefore
%%
\begin{align*}
\Var[Y_i]
&=
\E[Y_i^2] - \big(\E[Y_i]\big)^2 \\
&\leq
\E[Y_i]
\end{align*}
%%
which proves the lemma.
\end{proof}
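
The lemma can be checked exactly on a small example by enumerating
the sample space. In the Python snippet below, three correlated
$0$-$1$ variables are built from two fair coins; the construction is
ours, chosen only to make the covariance terms nonzero.

\begin{verbatim}
import itertools

# Y1 = b1, Y2 = b1 or b2, Y3 = b1 and b2, for fair coins b1, b2.
outcomes = [((b1, b1 | b2, b1 & b2), 0.25)
            for b1, b2 in itertools.product([0, 1], repeat=2)]

def expect(f):
    """Expectation of f over the four equally likely outcomes."""
    return sum(f(ys) * pr for ys, pr in outcomes)

m = 3
mean = [expect(lambda ys, i=i: ys[i]) for i in range(m)]
e_y = sum(mean)
var_y = expect(lambda ys: sum(ys) ** 2) - e_y ** 2
cov_sum = sum(expect(lambda ys, i=i, j=j: ys[i] * ys[j]) - mean[i] * mean[j]
              for i in range(m) for j in range(m) if i != j)
print(var_y, e_y + cov_sum)   # Var[Y] = 1.25 <= 2.125 = E[Y] + covariances
\end{verbatim}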

We now use the second moment method and
Lemma~\ref{lem:probabilistic_method:upper_bound_var_with_expectation_covar}
to prove a threshold behavior of a random graph property. In the
$G_{n,p}$ model we often find that a graph property has a threshold
function $f$ with these twin properties:
%%
\begin{itemize}
\item When $p$ grows asymptotically slower than $f(n)$, almost no
  graph has the desired property.

\item When $p$ grows asymptotically faster than $f(n)$, almost all
  graphs exhibit the desired property.
\end{itemize}
%%
The following theorem is a specific example of the above threshold
phenomenon.

\begin{theorem}
\label{thm:probabilistic_method:no_clique_at_least_four_vertices}
Consider the $G_{n,p}$ model with $\varepsilon > 0$ and $p = f(n)$. If
$n$ is sufficiently large and $f(n) = o(n^{-2/3})$, then there is a
probability less than $\varepsilon$ that a random graph chosen from
$G_{n,p}$ has a clique of at least four vertices. Similarly, if $n$ is
sufficiently large and $f(n) = \omega(n^{-2/3})$, then there is a
probability less than $\varepsilon$ that a random graph chosen from $G_{n,p}$
does not have a clique of at least four vertices.
\end{theorem}

\begin{proof}
First, consider the case where $p = f(n)$ and $f(n) = o(n^{-2/3})$.
Consider an enumeration $C_1, \dots, C_{\binom{n}{4}}$ of all the
subsets of four vertices in $G$. Define a $0$-$1$ random variable
$X_i$ by
\[
X_i
=
\begin{cases}
1, & \text{if $C_i$ is a $4$-clique,} \\
0, & \text{otherwise}
\end{cases}
\]
and write $X = \sum_i X_i$. A clique on four vertices contains
$\binom{4}{2} = 6$ edges, so
\[
\E[X]
=
\binom{n}{4} p^6
\]
so that $\E[X] = o(1)$, hence $\E[X] < \varepsilon$ for sufficiently
large $n$. Since $X$ is a nonnegative integer-valued random variable,
Markov's inequality gives
\[
\Pr(X \geq 1)
\leq
\E[X]
<
\varepsilon.
\]
In other words, we have a probability of less than $\varepsilon$ that
a random graph chosen from $G_{n,p}$ has a clique of at least four
vertices.

Now consider the case where $p = f(n)$ and $f(n) = \omega(n^{-2/3})$.
As $n \to \infty$ we have $\E[X] \to \infty$. To use
Theorem~\ref{thm:probabilistic_method:upper_bound_nonnegative_integer_rand_var}
to show that $\Pr(X = 0) = o(1)$, we need to show that
$\Var[X] = o\big((\E[X])^2\big)$. By
Lemma~\ref{lem:probabilistic_method:upper_bound_var_with_expectation_covar},
it suffices to bound the covariances of the $X_i$, which depend on
the number of vertices shared by $C_i$ and $C_j$.
%%
\begin{enumerate}
\item If $|C_i \cap C_j| = 0$, then the cliques $C_i$ and $C_j$ are
  disjoint and hence independent. It follows from
  Theorem~\ref{thm:moments:expectation_is_multiplicative} that
  $\E[X_i X_j] - \E[X_i] \cdot \E[X_j] = 0$. The same holds for the
  case $|C_i \cap C_j| = 1$.

\item If $|C_i \cap C_j| = 2$, then $C_i$ and $C_j$ share one
  edge. For both cliques to appear in the graph, the
  $6 + 6 - 1 = 11$ corresponding edges must be in the graph. Then we have
  \[
  \E[X_i X_j] - \E[X_i] \cdot \E[X_j]
  \leq
  \E[X_i X_j]
  \leq
  p^{11}.
  \]
  We want to choose two vertices for $C_i \cap C_j$, two vertices
  exclusively for $C_i$, and two for $C_j$ only. The six vertices can
  be chosen in $\binom{n}{6}$ ways, and there are $\binom{6}{2;2;2}$
  ways to apportion those six vertices into $C_i$ and $C_j$.

\item If $|C_i \cap C_j| = 3$, then $C_i$ and $C_j$ share three
  edges. For both cliques to appear in the graph, the
  $6 + 6 - 3 = 9$ corresponding edges must be in the graph. Then we have
  \[
  \E[X_i X_j] - \E[X_i] \cdot \E[X_j]
  \leq
  \E[X_i X_j]
  \leq
  p^9.
  \]
  The five vertices can be chosen in $\binom{n}{5}$ ways, and we have
  $\binom{5}{3;1;1}$ ways to apportion them to $C_i$ and $C_j$.
\end{enumerate}

Recall that $\E[X] = \binom{n}{4} p^6$ and by hypothesis we have
$p = f(n) = \omega(n^{-2/3})$. Since
\[
\big(\E[X]\big)^2
=
\left(\binom{n}{4} p^6\right)^2
=
\Theta(n^8 p^{12}),
\]
we get
%%
\begin{align*}
& \Var[X] \\[4pt]
&\leq
\binom{n}{4} p^6 + \binom{n}{6} \binom{6}{2;2;2} p^{11}
+ \binom{n}{5} \binom{5}{3;1;1} p^9 \\[4pt]
&=
o(n^8 p^{12}) \\[4pt]
&=
o\big((\E[X])^2\big).
\end{align*}
%%
Apply
Theorem~\ref{thm:probabilistic_method:upper_bound_nonnegative_integer_rand_var}
to see that $\Pr(X = 0) = o(1)$.
\end{proof}
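
The threshold at $n^{-2/3}$ is visible even at modest $n$. The Python
sketch below estimates the probability that $G_{n,p}$ contains a
$4$-clique for $p$ a small and a large multiple of $n^{-2/3}$; the
constants are ours, chosen only to sit on either side of the
threshold.

\begin{verbatim}
import itertools
import random

def has_k4(n, p):
    """Sample G_{n,p} and test for a clique on four vertices."""
    edges = {e for e in itertools.combinations(range(n), 2)
             if random.random() < p}
    for quad in itertools.combinations(range(n), 4):
        if all(e in edges for e in itertools.combinations(quad, 2)):
            return True
    return False

n, trials = 30, 100
for c in (0.2, 3.0):               # p = c * n^(-2/3)
    p = c * n ** (-2.0 / 3)
    rate = sum(has_k4(n, p) for _ in range(trials)) / float(trials)
    print(c, rate)                 # near 0 and near 1, respectively
\end{verbatim}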

Theorem~\ref{thm:probabilistic_method:no_clique_at_least_four_vertices}
can also be proven using the following result.

\begin{theorem}
\textbf{Conditional expectation inequality.}
Consider a collection $X_1, \dots, X_n$ of $0$-$1$ random variables
and write $X = \sum_i X_i$. Then
\[
\Pr(X > 0)
\geq
\sum_i \frac{\Pr(X_i = 1)} {\E[X \mid X_i = 1]}.
\]
\end{theorem}

\begin{proof}
Define a random variable $Y$ by
\[
Y
=
\begin{cases}
1/X, & \text{if $X > 0$,} \\
0, & \text{otherwise}
\end{cases}
\]
Note that $XY = 1$ exactly when $X > 0$, and $XY = 0$ otherwise. Hence
%%
\begin{align*}
\Pr(X > 0)
&=
\E[XY] \\[4pt]
&=
\E\left[\sum_i X_i Y\right] \\[4pt]
&=
\sum_i \big(
\E[X_i Y \mid X_i = 1] \cdot \Pr(X_i = 1)
+ \E[X_i Y \mid X_i = 0] \cdot \Pr(X_i = 0)
\big) \\[4pt]
&=
\sum_i \E[Y \mid X_i = 1] \cdot \Pr(X_i = 1) \\[4pt]
&=
\sum_i \E[1/X \mid X_i = 1] \cdot \Pr(X_i = 1).
\end{align*}
%%
Apply Jensen's
inequality~(Theorem~\ref{thm:discrete:Jensen_inequality}) with the
function $f(x) = 1/x$, which is convex for $x > 0$, to bound the last
expression from below, obtaining
\[
\Pr(X > 0)
\geq
\sum_i \frac{\Pr(X_i = 1)} {\E[X \mid X_i = 1]}
\]
as required.
\end{proof}
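
As with the earlier lemmas, the inequality can be verified exactly on
a small example by enumerating the joint distribution. In the Python
snippet below, the $X_i$ are independent Bernoulli variables with
parameters of our choosing.

\begin{verbatim}
import itertools

p = [0.2, 0.5, 0.7]    # Pr(X_i = 1) for three independent variables

def prob(w):
    """Probability of the outcome w, a 0-1 tuple, under independence."""
    result = 1.0
    for bit, pi in zip(w, p):
        result *= pi if bit else 1 - pi
    return result

outcomes = list(itertools.product([0, 1], repeat=len(p)))
pr_positive = sum(prob(w) for w in outcomes if sum(w) > 0)

bound = 0.0
for i in range(len(p)):
    mass = sum(prob(w) for w in outcomes if w[i] == 1)   # Pr(X_i = 1)
    cond_exp = sum(sum(w) * prob(w)
                   for w in outcomes if w[i] == 1) / mass
    bound += mass / cond_exp
print(pr_positive, bound)   # Pr(X > 0) = 0.88 >= 0.766... = bound
\end{verbatim}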
